Code
library(dplyr)
library(knitr)
library(limma)
library(DT)
library(readr)
library(knitr)
library(tidyr)
library(ggplot2)
library(ggcorrplot)library(dplyr)
library(knitr)
library(limma)
library(DT)
library(readr)
library(knitr)
library(tidyr)
library(ggplot2)
library(ggcorrplot)We selected SLEDAI, C3, and C4 as our core clinical indicators for risk grouping and downstream differential expression analysis based on their biological importance and widespread use in lupus research and clinical practice.
The table below summarizes the full names, biological functions, cutoff thresholds, and supporting references of each variable:
| Variable | Full Name | Biological Role | Cutoff Threshold | Reference |
| SLEDAI | Systemic Lupus Erythematosus Disease Activity Index | Measures lupus disease activity. Higher scores indicate more active disease. | ≥ 10 (High Risk) | Bombardier et al., 1992; Petri et al., 2005 |
| C3 | Complement Component 3 | Immune protein; lower levels indicate increased immune complex activation. | < 0.9 g/L (High Risk) | Walport, 2001; Yang et al., 2020 |
| C4 | Complement Component 4 | Works with C3; its reduction also reflects lupus disease activity. | < 0.1 g/L (High Risk) | Walport, 2001; Yang et al., 2020 |
# View NA value
na_summary <- data.frame(
Variable = c("SLEDAI", "C3", "C4"),
NA_Count = c(
sum(is.na(pheno_clean$sledai_at_baseline)),
sum(is.na(pheno_clean$c3)),
sum(is.na(pheno_clean$c4))
)
)
datatable(na_summary,
caption = "NA Count Summary for Clinical Variables",
options = list(pageLength = 5, autoWidth = TRUE))#Replace NA value with median to ensure dimensional consistency.
median_sledai <- median(pheno_clean$sledai_at_baseline, na.rm = TRUE)
median_c3 <- median(pheno_clean$c3, na.rm = TRUE)
median_c4 <- median(pheno_clean$c4, na.rm = TRUE)
pheno_clean <- pheno_clean %>%
mutate(
sledai_at_baseline = ifelse(is.na(sledai_at_baseline), median_sledai, sledai_at_baseline),
c3 = ifelse(is.na(c3), median_c3, c3),
c4 = ifelse(is.na(c4), median_c4, c4)
)
summary_df <- as.data.frame(summary(pheno_clean))
datatable(summary_df,
caption = "Summary Statistics After NA Imputation",
options = list(pageLength = 10, autoWidth = TRUE))pheno_long <- pheno_clean %>%
select(SLEDAI = sledai_at_baseline, C3 = c3, C4 = c4) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")
ggplot(pheno_long, aes(x = Value)) +
geom_histogram(bins = 30, fill = "#0073C2FF", color = "white") +
facet_wrap(~Variable, scales = "free") +
labs(title = "Distribution of Clinical Variables",
x = "Value",
y = "Frequency") +
theme_minimal()ggplot(pheno_long, aes(x = "", y = Value)) +
geom_boxplot(fill = "#f9844a") +
facet_wrap(~Variable, scales = "free_y") +
labs(title = "Boxplot of Clinical Variables ",
x = "",
y = "Value") +
theme_minimal()cor_vars <- pheno_clean %>%
select(SLEDAI = sledai_at_baseline, C3 = c3, C4 = c4)
cor_matrix <- cor(cor_vars, use = "complete.obs", method = "pearson")
ggcorrplot(cor_matrix,
lab = TRUE,
type = "lower",
lab_size = 4,
title = "Correlation Matrix of Clinical Variables",
colors = c("#B2182B", "white", "#2166AC"))if("sex" %in% names(pheno_clean)) {
sex_table <- as.data.frame(table(pheno_clean$sex))
colnames(sex_table) <- c("Sex", "Count")
datatable(sex_table,
caption = "Sex Distribution in the Sample",
options = list(pageLength = 5, autoWidth = TRUE))
}if("age" %in% names(pheno_clean)) {
ggplot(pheno_clean, aes(x = age)) +
geom_histogram(bins = 30, fill = "#00AFBB", color = "white") +
labs(title = "Age Distribution", x = "Age", y = "Frequency") +
theme_minimal()
}#Create risk classification based on different clinical indicators
pheno_clean$risk_group_sledai <- ifelse(
pheno_clean$sledai_at_baseline >= 10, "HighRisk_SLEDAI", "LowRisk_SLEDAI"
)
pheno_clean$risk_group_c3 <- ifelse(
pheno_clean$c3 < 0.9, "HighRisk_C3", "LowRisk_C3"
)
pheno_clean$risk_group_c4 <- ifelse(
pheno_clean$c4 < 0.1, "HighRisk_C4", "LowRisk_C4"
)
#Calculate the sample size for each risk group
df_sledai = as_tibble(table(pheno_clean$risk_group_sledai), .name_repair = "minimal")
df_c3 = as_tibble(table(pheno_clean$risk_group_c3), .name_repair = "minimal")
df_c4 = as_tibble(table(pheno_clean$risk_group_c4), .name_repair = "minimal")
colnames(df_sledai) = c("Group", "Count_SLEDAI")
colnames(df_c3) = c("Group", "Count_C3")
colnames(df_c4) = c("Group", "Count_C4")
df_combined_sledai_c3 = full_join(df_sledai, df_c3, by = "Group")
df_combined = full_join(df_combined_sledai_c3, df_c4, by = "Group")
kable(df_combined, caption = "Risk Group Statistics Based on SLEDAI, C3, and C4 Levels")| Group | Count_SLEDAI | Count_C3 | Count_C4 |
|---|---|---|---|
| HighRisk_SLEDAI | 1073 | NA | NA |
| LowRisk_SLEDAI | 747 | NA | NA |
| HighRisk_C3 | NA | 650 | NA |
| LowRisk_C3 | NA | 1170 | NA |
| HighRisk_C4 | NA | NA | 384 |
| LowRisk_C4 | NA | NA | 1436 |
threshold_sledai <- 10
pheno_clean$risk_group_SLEDAI <- ifelse(pheno_clean$sledai_at_baseline >= threshold_sledai, "HighRisk_SLEDAI", "LowRisk_SLEDAI")
design_sledai <- model.matrix(~ pheno_clean$risk_group_SLEDAI)
#Fit limma model
fit_sledai <- lmFit(expr_df_matched, design_sledai)
#Applying eBayes method
fit_sledai <- eBayes(fit_sledai)
#Extract differentially expressed genes
results_sledai <- topTable(fit_sledai, coef=2, n=Inf)
results_sledai_filtered <- results_sledai[results_sledai$adj.P.Val < 0.05, ]
#Create an interactive table using the DT package
datatable(round(results_sledai_filtered, 2),
options = list(pageLength = 10, autoWidth = TRUE),
caption = "Significant Differentially Expressed Genes in SLEDAI (p < 0.05)")threshold_c3 <- 0.9
pheno_clean$risk_group_C3 <- ifelse(pheno_clean$c3 < threshold_c3, "HighRisk_C3", "LowRisk_C3")
design_c3 <- model.matrix(~ pheno_clean$risk_group_C3)
fit_c3 <- lmFit(expr_df_matched, design_c3)
fit_c3 <- eBayes(fit_c3)
results_c3 <- topTable(fit_c3, coef=2, n=Inf)
results_c3_filtered <- results_c3[results_c3$adj.P.Val < 0.05, ]
datatable(round(results_c3_filtered, 2),
options = list(pageLength = 10, autoWidth = TRUE),
caption = "Significant Differentially Expressed Genes in C3 Analysis (p < 0.05)")# 假设C4得分阈值是0.1
threshold_c4 <- 0.1
# 创建风险分组列
pheno_clean$risk_group_C4 <- ifelse(pheno_clean$c4 < threshold_c4, "HighRisk_C4", "LowRisk_C4")
# 为 C4 风险分组设置设计矩阵
design_c4 <- model.matrix(~ pheno_clean$risk_group_C4)
# 拟合 limma 模型
fit_c4 <- lmFit(expr_df_matched, design_c4)
# 应用 eBayes 方法
fit_c4 <- eBayes(fit_c4)
# 提取差异表达的基因
results_c4 <- topTable(fit_c4, coef=2, n=Inf) # 提取所有基因
results_c4_filtered <- results_c4[results_c4$adj.P.Val < 0.05, ] # 筛选 p 值小于 0.05 的基因
# 使用 DT 包来创建一个交互式表格
datatable(round(results_c4_filtered, 2),
options = list(pageLength = 10, autoWidth = TRUE),
caption = "Significant Differentially Expressed Genes in C4 Analysis (p < 0.05)")Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
#Screen genes with p-values less than 0.05 in each analysis group
significant_sledai <- results_sledai[results_sledai$adj.P.Val < 0.05,]
significant_c3 <- results_c3[results_c3$adj.P.Val < 0.05,]
significant_c4 <- results_c4[results_c4$adj.P.Val < 0.05,]
#Find the intersection using gene names
common_genes <- Reduce(intersect, list(rownames(significant_sledai),
rownames(significant_c3),
rownames(significant_c4)))
#Can create a data box containing these common genetic information
common_genes_data <- data.frame(
Gene = common_genes,
PVal_SLEDAI = significant_sledai[common_genes, "adj.P.Val", drop = FALSE],
PVal_C3 = significant_c3[common_genes, "adj.P.Val", drop = FALSE],
PVal_C4 = significant_c4[common_genes, "adj.P.Val", drop = FALSE]
)
datatable(common_genes_data,
options = list(pageLength = 10, autoWidth = TRUE),
caption = "Common Significant Genes Across SLEDAI, C3, and C4 Analyses")# 保存筛选后的基因数据到 CSV 文件
write.csv(common_genes_data, "Common_Significant_Genes.csv", row.names = FALSE)results_sledai$Gene <- rownames(results_sledai)
results_sledai$Time <- "SLEDAI"
results_c3$Gene <- rownames(results_c3)
results_c3$Time <- "C3"
results_c4$Gene <- rownames(results_c4)
results_c4$Time <- "C4"
volcano_all <- rbind(results_sledai, results_c3, results_c4)
volcano_all$IsCommon <- ifelse(volcano_all$Gene %in% common_genes, "Common", "NotCommon")
volcano_common <- subset(volcano_all, IsCommon == "Common")
library(ggplot2)
library(plotly)
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
p_common <- ggplot(volcano_common, aes(x = logFC, y = -log10(P.Value))) +
geom_point(aes(color = Time, text = Gene), size = 1.5, alpha = 0.8) +
facet_wrap(~ Time) +
labs(x = "logFC", y = "-log10(p value)", title = "Common Significant Genes") +
theme_minimal()Warning in geom_point(aes(color = Time, text = Gene), size = 1.5, alpha = 0.8):
Ignoring unknown aesthetics: text
p_interactive <- ggplotly(p_common, tooltip = "text")
p_interactivep_ma <- ggplot(volcano_common, aes(x = AveExpr, y = logFC)) +
geom_point(aes(color = Time, text = Gene), size = 1.5, alpha = 0.8) +
facet_wrap(~ Time) +
labs(x = "Average Expression (log2)", y = "log2 Fold Change", title = "MA Plot - Common DEGs") +
theme_minimal()Warning in geom_point(aes(color = Time, text = Gene), size = 1.5, alpha = 0.8):
Ignoring unknown aesthetics: text
p_ma_interactive <- ggplotly(p_ma, tooltip = "text")
p_ma_interactive